import pandas as pd
import matplotlib.pyplot as plt
import cv2
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from tqdm.notebook import tqdm
from scipy.spatial.distance import cosine, chebyshev, canberra, braycurtis, euclidean
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import seaborn as sns
from matplotlib import rcParams
import pickle
# Load the labelled image-pair dataset; the CSV carries a stale index column
# ("Unnamed: 0") that we discard on read.
df = pd.read_csv('/home/jupyter/REPO EXPLO/Image_similarity/df_unified.csv').drop(columns=["Unnamed: 0"])
df.head()
| path_x | numint | path_y | nom_ls | gs_uri | label | lab_num | |
|---|---|---|---|---|---|---|---|
| 0 | 2023-03-10#pto#avant#0000019200133#565aa5aa477... | 19200133 | 2023-03-10#pto#apres#0000019200133#7d8f75e40f4... | 19200133_4298 | gs://ofr-vqi-data-pipeline-models-dev/notebook... | Same | 0 |
| 1 | 2023-03-13#pto#avant#0000021156692#4b2aa6c3942... | 21156692 | 2023-03-13#pto#apres#0000021156692#134ff6b528e... | 21156692_5026 | gs://ofr-vqi-data-pipeline-models-dev/notebook... | Not enough context elements | 0 |
| 2 | 2023-03-10#pto#avant#0000017998364#0dfdd44fd1d... | 17998364 | 2023-03-10#pto#apres#0000017998364#0a059bc7f01... | 17998364_3768 | gs://ofr-vqi-data-pipeline-models-dev/notebook... | Same | 0 |
| 3 | 2023-03-10#pto#avant#0000022631562#18983b6cc91... | 22631562 | 2023-03-10#pto#apres#0000022631562#5ae856a5232... | 22631562_5294 | gs://ofr-vqi-data-pipeline-models-dev/notebook... | Same | 0 |
| 4 | 2023-03-13#pto#avant#0000023208443#279f5755fb3... | 23208443 | 2023-03-13#pto#apres#0000023208443#081a25ab7af... | 23208443_5745 | gs://ofr-vqi-data-pipeline-models-dev/notebook... | Same | 0 |
len(df)
9705
df["label"].value_counts()
Same 8190 Not enough context elements 838 Not same 677 Name: label, dtype: int64
def load_image_viz(path1, path2,
                   base_dir="/home/jupyter/Notebooks/folder_20k/imgs_paire_pto_10k/",
                   size=(600, 600), gap=25):
    """Load an image pair and compose it side by side on a white canvas.

    Parameters
    ----------
    path1, path2 : str
        File names of the two images, relative to ``base_dir``.
    base_dir : str, optional
        Directory that contains the images (default: the notebook's
        image folder, matching the original hard-coded path).
    size : tuple[int, int], optional
        ``(width, height)`` each image is resized to before composition.
    gap : int, optional
        Width in pixels of the white separator between the two images.

    Returns
    -------
    numpy.ndarray
        ``uint8`` RGB canvas containing both resized images separated by
        a ``gap``-pixel white margin.
    """
    # cv2 reads BGR; convert to RGB so matplotlib displays correct colors.
    img1 = cv2.cvtColor(cv2.imread(base_dir + path1), cv2.COLOR_BGR2RGB)
    img2 = cv2.cvtColor(cv2.imread(base_dir + path2), cv2.COLOR_BGR2RGB)
    img1 = cv2.resize(img1, size)
    img2 = cv2.resize(img2, size)
    h1, w1, _ = img1.shape
    h2, w2, _ = img2.shape
    # White canvas tall enough for the taller image, wide enough for both
    # images plus the separator.
    result = np.full((max(h1, h2), w1 + w2 + gap, 3), 255, dtype=np.uint8)
    result[0:h1, 0:w1, :] = img1
    result[0:h2, w1 + gap:w1 + w2 + gap, :] = img2
    return result
# Show one example pair for each label category.
# Candidate indices — Same: 6, 28, 35, 42, 63 ; Not same: 3, 8, 20 ;
# Not enough context elements: 0, 21, 22
y = ("Same", "Not same", "Not enough context elements", "Not enough context elements")
titles = ("Same", "Not same", "Not enough context elements exemple 1",
          "Not enough context elements exemple 2")
idx = (42, 8, 6, 111)
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
# FIX: the original if/else branches were byte-identical (dead duplication),
# and the label subset was computed twice per cell — iterate the flattened
# axes once and build the subset a single time per panel.
for z, ax in enumerate(axs.flat):
    subset = df[df["label"].str.contains(y[z])].reset_index()
    ax.imshow(load_image_viz(subset['path_x'][idx[z]], subset['path_y'][idx[z]]))
    ax.set_title(titles[z])
    ax.axis("off")
fig.tight_layout()
# Show four extra examples of the "Not enough context elements" category.
# Candidate indices — Same: 6, 28, 35, 42, 63 ; Not same: 3, 8, 20 ;
# Not enough context elements: 0, 21, 22
y = ("Not enough context elements", "Not enough context elements",
     "Not enough context elements", "Not enough context elements")
titles = ("Not enough context elements", "Not enough context elements",
          "Not enough context elements", "Not enough context elements")
idx = (71, 28, 89, 114)
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
# FIX: the original if/else branches were byte-identical (dead duplication),
# and the label subset was computed twice per cell — iterate the flattened
# axes once and build the subset a single time per panel.
for z, ax in enumerate(axs.flat):
    subset = df[df["label"].str.contains(y[z])].reset_index()
    ax.imshow(load_image_viz(subset['path_x'][idx[z]], subset['path_y'][idx[z]]))
    ax.set_title(titles[z])
    ax.axis("off")
fig.tight_layout()
# Preview the architecture diagram of the distance-based similarity pipeline.
Image.open("/home/jupyter/REPO EXPLO/Image_similarity/archi_dist.png")
# Extract features on GPU when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# ImageNet-pretrained ResNet-152 backbone.
# NOTE(review): `pretrained=` is deprecated in recent torchvision — prefer
# `weights=models.ResNet152_Weights.IMAGENET1K_V1`; confirm the installed version.
resnet = models.resnet152(pretrained=True)
# Drop the final fully-connected classifier so the model emits the pooled
# feature vector instead of class logits.
output = nn.Sequential(*list(resnet.children())[:-1])
output.to(device)
def load_image_torch(path):
    """Read the image at *path*, convert BGR->RGB, resize to 224x224 and
    min-max scale its intensities into [0, 1].

    NOTE(review): cv2.NORM_MINMAX rescales with this image's own min/max,
    which is not the same as a fixed /255 scaling — confirm this is intended.
    """
    rgb = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
    resized = cv2.resize(rgb, (224, 224))
    return cv2.normalize(resized.astype('float'), None, 0.0, 1.0, cv2.NORM_MINMAX)
def prediction_avant(i):
    """Return the ResNet feature vector (as a list) of the i-th "avant"
    (before) image, i.e. the file named in df["path_x"]."""
    img_path = "/home/jupyter/Notebooks/folder_20k/imgs_paire_pto_10k/" + df["path_x"].iloc[i]
    with torch.no_grad():
        batch = transform(load_image_torch(img_path)).unsqueeze(0).float().to(device)
        return output(batch).squeeze().tolist()
def prediction_apres(i):
    """Return the ResNet feature vector (as a list) of the i-th "apres"
    (after) image, i.e. the file named in df["path_y"]."""
    img_path = "/home/jupyter/Notebooks/folder_20k/imgs_paire_pto_10k/" + df["path_y"].iloc[i]
    with torch.no_grad():
        batch = transform(load_image_torch(img_path)).unsqueeze(0).float().to(device)
        return output(batch).squeeze().tolist()
# HWC float array -> CHW torch tensor; no extra augmentation/normalisation.
transform = transforms.Compose([transforms.ToTensor()])
# Compute one distance of each kind between the "avant" and "apres" feature
# vectors of every image pair.
dist_cos = []
dist_euc = []
dist_cheb = []
dist_can = []
dist_bray = []
for i in tqdm(range(len(df))):
    pred1 = prediction_avant(i)
    pred2 = prediction_apres(i)
    dist_cos.append(cosine(pred1, pred2))
    # BUG FIX: euclidean and braycurtis were swapped — dist_euc was filled
    # with braycurtis() and dist_bray with euclidean(), so the DataFrame
    # columns below carried the wrong metric (and MinMaxScaler rescaled the
    # wrong data). Each list now holds the metric its name says.
    dist_euc.append(euclidean(pred1, pred2))
    dist_cheb.append(chebyshev(pred1, pred2))
    dist_can.append(canberra(pred1, pred2))
    dist_bray.append(braycurtis(pred1, pred2))
# Column names aligned with the rest of the notebook: "chebyshev" (was
# misspelt "chebychev", while L132 drops "chebyshev") and "lab_num" (was
# "label", while the train/test split reads "lab_num").
DF_DIST = pd.DataFrame({"cosine": dist_cos, "braycurtis": dist_bray, "chebyshev": dist_cheb,
                        "canberra": dist_can, "euclidean": dist_euc, "lab_num": df["lab_num"].tolist()})
# Rescale the two unbounded distances into [0, 1] so they are comparable
# with the naturally-bounded metrics.
scaler = MinMaxScaler()
DF_DIST[["canberra", "braycurtis"]] = scaler.fit_transform(DF_DIST[["canberra", "braycurtis"]])
#DF_DIST.to_csv("REPO EXPLO/Image_similarity/DF_DIST.csv")
# Reload the precomputed distance table; keep only the metrics used by the
# classifier (drop the CSV index, chebyshev and cosine columns).
df_dist = (
    pd.read_csv("/home/jupyter/REPO EXPLO/Image_similarity/DF_DIST.csv")
    .drop(columns=["Unnamed: 0", "chebyshev", "cosine"])
)
# Hold out 30% for evaluation, preserving the class ratio via stratification.
features = df_dist.drop(columns=["lab_num"])
target = df_dist["lab_num"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    stratify=target,
    random_state=42,
)
# Oversample the minority class on the training split only (never the test set).
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# RBF-kernel SVM on the resampled training data.
clf = SVC(C=10, kernel="rbf", tol=0.0001, max_iter=1000)
clf.fit(X_train_resampled, y_train_resampled)
# Evaluate on the held-out split with a row-normalised confusion matrix
# (each row sums to 1, i.e. per-class recall), rounded to 2 decimals.
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm = np.round(cm.astype('float') / cm.sum(axis=1, keepdims=True), 2)
rcParams['figure.figsize'] = 8, 5
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=["Same", "Not Same"], yticklabels=["Same", "Not Same"])
<Axes: >
# with open('/home/jupyter/REPO EXPLO/Image_similarity/SVC_FINAL.pkl', 'wb') as f:
# pickle.dump(clf, f)